{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 08 Interquartile range (IQR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<iframe width=\"700\" height=\"400\" src=\"https://www.youtube.com/embed/qLYYHWYr8xI/\" frameborder=\"0\" allowfullscreen></iframe>\n"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%%html\n",
    "<iframe width=\"700\" height=\"400\" src=\"https://www.youtube.com/embed/qLYYHWYr8xI/\" frameborder=\"0\" allowfullscreen></iframe>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pandas import Series, DataFrame\n",
    "import matplotlib.pyplot as plt\n",
    "from scipy import stats"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[khanacademy](https://www.khanacademy.org/math/ap-statistics/summarizing-quantitative-data-ap/measuring-spread-quantitative/v/calculating-interquartile-range-iqr?modal=1)\n",
    "[pandas.DataFrame.quantile](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.quantile.html)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![Interquartile range (IQR) fig 1](./imgs/03-08-01.png)![Interquartile range (IQR) fig 2](./imgs/03-08-02.png)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_data = {'x': [4,4,6,7,10,11,12,14,15]}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_data = {'y': [7, 9, 9, 10, 10, 11, 12, 12, 14]}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_df = DataFrame(x_data)\n",
    "y_df = DataFrame(y_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>9.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>9.222222</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>4.146618</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>4.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>6.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>10.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>12.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>15.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               x\n",
       "count   9.000000\n",
       "mean    9.222222\n",
       "std     4.146618\n",
       "min     4.000000\n",
       "25%     6.000000\n",
       "50%    10.000000\n",
       "75%    12.000000\n",
       "max    15.000000"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>9.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>10.444444</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>2.068279</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>7.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>9.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>10.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>12.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>14.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               y\n",
       "count   9.000000\n",
       "mean   10.444444\n",
       "std     2.068279\n",
       "min     7.000000\n",
       "25%     9.000000\n",
       "50%    10.000000\n",
       "75%    12.000000\n",
       "max    14.000000"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "x    12.0\n",
      "Name: 0.75, dtype: float64\n",
      "x    12\n",
      "Name: 0.75, dtype: int64\n",
      "y    12.0\n",
      "Name: 0.75, dtype: float64\n",
      "y    12\n",
      "Name: 0.75, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "print(x_df.quantile(q=0.75))\n",
    "print(x_df.quantile(q=0.75, interpolation='nearest'))\n",
    "\n",
    "print(y_df.quantile(q=0.75))\n",
    "print(y_df.quantile(q=0.75, interpolation='nearest'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6.0\n",
      "3.0\n"
     ]
    }
   ],
   "source": [
    "print(stats.iqr(x_df))\n",
    "print(stats.iqr(y_df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_df['Rank'] = x_df.index + 1\n",
    "x_df['Empirical_CDF'] = x_df['Rank'] / x_df.shape[0]\n",
    "x_q_25 = x_df.x[x_df['Empirical_CDF']>=0.25].reset_index(drop=True)[0]\n",
    "x_q_50 = x_df.x[x_df['Empirical_CDF']>=0.50].reset_index(drop=True)[0]\n",
    "x_q_75 = x_df.x[x_df['Empirical_CDF']>=0.75].reset_index(drop=True)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_df['Rank'] = y_df.index + 1\n",
    "y_df['Empirical_CDF'] = y_df['Rank'] / y_df.shape[0]\n",
    "y_q_25 = y_df.y[y_df['Empirical_CDF']>=0.25].reset_index(drop=True)[0]\n",
    "y_q_50 = y_df.y[y_df['Empirical_CDF']>=0.50].reset_index(drop=True)[0]\n",
    "y_q_75 = y_df.y[y_df['Empirical_CDF']>=0.75].reset_index(drop=True)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "q 25 6 q 50 10 q 75 12 => iqr = 6\n",
      "q 25 9 q 50 10 q 75 12 => iqr = 3\n"
     ]
    }
   ],
   "source": [
    "print(f'q 25 {x_q_25} q 50 {x_q_50} q 75 {x_q_75} => iqr = {x_q_75 - x_q_25}')\n",
    "print(f'q 25 {y_q_25} q 50 {y_q_50} q 75 {y_q_75} => iqr = {y_q_75 - y_q_25}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}